# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# File Name          : _function_analysis.R
# Programmer Name    : Luis Campos
#                     luiscampos@g.harvard.edu
#
# Purpose            : contains function for analysis of a multi-level 
#						treatment and numeric outcome
#
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Replication Information:
#   We do expect differences when running this code on different systems 
#   (and with different seeds), but the differences all tend to be quite 
#   small and the general trends remain. If you would like to replicate 
#   the exact numbers and figures used in the article, in the readme you 
#   will find the session info. Please install and load all package 
#   versions (including the R version) to ensure the exact replication. 
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #


do.analysis = function(dat = NULL, w = NULL, treatment = NULL, t0 = NULL, t1= NULL, outcome = NULL, sub_var = NULL, sub_level = NULL, B = 2000, return.all = TRUE){

	# keep only desired comparison
	keep.t = dat[,treatment] %in% c(t0, t1)
	# keep only desired subset
	keep.v = dat[,sub_var] == sub_level

	dat = dat[keep.t & keep.v,]




	# Some analysis parameters
	K = 5 # number of strata

	# data size
	n = nrow(dat)

	# remove any rows with missing treatment or outcomes
	# removes any rows that have any missing in the two columns of interest
	keep = which(rowSums(is.na(dat[,c(treatment, outcome)])) == 0)
	# number of used rows
	n.keep = length(keep)
	# count number of dropped rows
	n.na = n - n.keep

	# subset only columns needed (note we drop unused factors)
	if(class(dat[keep, treatment]) != 'factor'){
		dat.treat = as.factor(dat[keep, treatment])
	}else{
		dat.treat = dat[keep, treatment]
	}

	dat.treat = droplevels(dat.treat)

	dat.Yobs = dat[keep, outcome]
	weight = dat[keep, w]

	# create 0-1 indicators by hand to get all possible contrasts
	# there should be: length(levels(dat.treat))*(length(levels(dat.treat)) - 1)/2
	#  treatments in total
	lab = levels(dat.treat)

	# get treatment labels
	# Tmts.labels = apply(comps, 1, function(x) paste(x[1], x[2], sep = ' v. '))
	T = 1*(dat.treat == levels(dat.treat)[2])

	# keep data from sub-experiments, i.e. not NA Treatment
	keep.sub = !is.na(T)
	n.sub = sum(keep.sub)
	
	Yobs.sub = dat.Yobs[!is.na(T)]
	T.sub = T[!is.na(T)]
	weight.sub = weight[!is.na(T)]


	# Standardize to control group standard deviation
	Yobs.sub[T.sub == 0] = Yobs.sub[T.sub == 0]/sd(Yobs.sub[T.sub == 0])
	Yobs.sub[T.sub == 1] = Yobs.sub[T.sub == 1]/sd(Yobs.sub[T.sub == 0])

	ests = calc.txs( Yobs.sub, T.sub, weight.sub, K=K )
	# about 56.928 seconds total (bring this down (?), limit calc.txs)
	SErep = replicate( B, {
		# Creating Treatment Variable, under Null
		Tx <- NULL
		Tx = sample( c(0,1), n.sub, replace=TRUE ) 
		# case resampling
	  	edat.star = sample( n.sub, replace=TRUE )
		calc.txs(Yobs.sub[edat.star], Tx[edat.star], weight.sub[edat.star], K=K )	
	} )

	rs  = melt( SErep )
	rs = subset( rs, Var1 != "Z" & Var1 != "n" )

	res = ddply( rs, .(Var1), summarize,
			n.na = sum( is.na( value ) ),
			mean = mean( value, na.rm=TRUE ),
			SE = sd( value, na.rm=TRUE ) )

	

	# add/remove stuff and reorder
	res$est = ests[1:9]
	res$mean = NULL
	res$n.na = NULL

	ests = list(res = res[,c(1, 3, 2)], dats = ests[10:11])


	ests$dats = c(ests$dats , n.na =  n.na, K = K, weight = w, outcome = outcome, treatment = treatment, treatment_comp = paste(t0, 'v.', t1), subset_var = sub_var, subset_level = sub_level)

	ests$bs = c(cor.star1 = cor(rs[rs$Var1 == 'tau.sate',3], rs[rs$Var1 == 'tau.hh',3], use = 'complete.obs'), cor.star2 = cor(rs[rs$Var1 == 'tau.sate',3], rs[rs$Var1 == 'tau.ps.hh',3], use = 'complete.obs'))

	ests
}


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Given a list of ids, get the number of times each id appears in an experment
#	need ids to be 1:N
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
get_cross_experiments_counts = function(ids){
	exp_labs = matrix(0, nrow = max(unlist(ids)), ncol = length(ids))
	for(i in 1:length(ids)){
		exp_labs[ids[[i]],i] <- 1
	}
	exp_labs
}


